library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.3.2
library(htmlwidgets)
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
## OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(units)
## udunits database from /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library/units/share/udunits/udunits2.xml
library(dplyr)
Part A. 1.
data.dir = gsub('/code', '/data', getwd())
neighborhoods= st_read(paste0(data.dir, '/hw1-data/Toronto_Neighbourhoods/Toronto_Neighbourhoods.shp'))
## Reading layer `Toronto_Neighbourhoods' from data source
## `/Users/davegong/Desktop/year4 fall/sta465/HW1/hw1-data/Toronto_Neighbourhoods/Toronto_Neighbourhoods.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 140 features and 39 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -79.63926 ymin: 43.581 xmax: -79.11524 ymax: 43.85546
## Geodetic CRS: WGS 84
subway_lines= st_read(paste0(data.dir, '/hw1-data/ttc-subway-shapefile-wgs84/TTC_SUBWAY_LINES_WGS84.shp'))
## Reading layer `TTC_SUBWAY_LINES_WGS84' from data source
## `/Users/davegong/Desktop/year4 fall/sta465/HW1/hw1-data/ttc-subway-shapefile-wgs84/TTC_SUBWAY_LINES_WGS84.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 4 features and 3 fields
## Geometry type: LINESTRING
## Dimension: XY
## Bounding box: xmin: -79.5354 ymin: 43.63781 xmax: -79.25107 ymax: 43.79677
## Geodetic CRS: WGS 84
subway_lines_wgs84 <- st_transform(subway_lines, crs = 4326)
neighborhoods_wgs84 <- st_transform(neighborhoods, crs = 4326)
crime_sites = read.csv(paste0(data.dir, "/hw1-data/toronto-crimes-06-2024.csv")) %>%
mutate(REPORT_DATE = as.Date(REPORT_DATE))
Write-up: Toronto_Neighbourhoods.shp is Polygons/Multipolygons TTC_SUBWAY_LINES_WGS84.shp is Polylines toronto-crimes-06-2024.csv is points/multipoints
.shp: main shapefile .shx: spatial index .dbf: dBase file which stores the non-spatial columns .prj: definition of the projection of the spatial data
head(neighborhoods)
## Simple feature collection with 6 features and 39 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -79.52344 ymin: 43.62049 xmax: -79.28461 ymax: 43.79607
## Geodetic CRS: WGS 84
## OBJECTID Neighbourh Total_Area Total_Popu Pop_Males Pop_Female
## 1 1 Yonge-St.Clair 1.2 11655 5235 6420
## 2 2 York University Heights 13.2 27715 13580 14125
## 3 3 Lansing-Westgate 5.4 14640 6950 7695
## 4 4 Yorkdale-Glen Park 5.9 14685 6750 7940
## 5 5 Stonegate-Queensway 7.9 24690 11935 12745
## 6 6 Tam O'Shanter-Sullivan 5.5 27390 12840 14555
## Pop0_4year Pop5_9year Pop10_14ye Pop15_19ye Pop20_24ye Pop25_29ye Pop30_34ye
## 1 395 345 235 285 645 1355 1125
## 2 1645 1385 1380 1750 3280 2815 2120
## 3 845 725 745 795 965 1185 1290
## 4 640 690 815 835 845 890 895
## 5 1395 1235 1265 1295 1190 1270 1575
## 6 1460 1380 1370 1560 1765 1665 1605
## Pop35_39ye Pop40_44ye Pop45_49ye Pop50_54ye Pop55_59ye Pop60_64ye Pop65_69ye
## 1 900 860 755 715 760 820 755
## 2 1800 1820 1940 1750 1335 1150 870
## 3 1170 1175 1150 1035 855 750 515
## 4 860 1050 1085 1030 850 765 610
## 5 1865 1990 2190 2120 1790 1515 940
## 6 1850 1965 2230 1925 1655 1435 1235
## Pop70_74ye Pop75_79ye Pop80_84ye Pop85years Seniors55a Seniors65a Child0_14
## 1 525 445 365 395 4075 2505 985
## 2 890 825 545 395 6010 3545 4405
## 3 405 360 330 360 3565 1960 2300
## 4 690 755 690 700 5070 3450 2140
## 5 830 755 635 825 7295 3985 3895
## 6 1220 1190 945 970 8595 5520 4170
## Youth15_24 HomeLangua Language_C Language_I Language_K Language_P Language_1
## 1 925 11445 250 120 55 120 55
## 2 5040 26340 1750 2280 265 370 285
## 3 1755 14285 995 165 580 630 90
## 4 1690 13685 640 2950 50 70 810
## 5 2465 23980 330 755 135 65 555
## 6 3305 26195 8220 340 170 420 70
## Language_R Language_S Language_T Language_2 Language_U
## 1 180 230 95 10 10
## 2 430 1475 930 1175 580
## 3 770 320 460 15 45
## 4 85 625 605 120 60
## 5 635 455 260 25 35
## 6 65 315 820 1150 780
## geometry
## 1 POLYGON ((-79.39119 43.6810...
## 2 POLYGON ((-79.50529 43.7598...
## 3 POLYGON ((-79.43998 43.7615...
## 4 POLYGON ((-79.43969 43.7056...
## 5 POLYGON ((-79.49262 43.6474...
## 6 POLYGON ((-79.31979 43.7683...
head(subway_lines)
## Simple feature collection with 4 features and 3 fields
## Geometry type: LINESTRING
## Dimension: XY
## Bounding box: xmin: -79.5354 ymin: 43.63781 xmax: -79.25107 ymax: 43.79677
## Geodetic CRS: WGS 84
## OBJECTID ROUTE_NAME RID geometry
## 1 53420 LINE 1 (YONGE-UNIVERSITY) 1 LINESTRING (-79.52813 43.79...
## 2 53421 LINE 2 (BLOOR - DANFORTH) 2 LINESTRING (-79.5354 43.637...
## 3 53422 LINE 3 (SCARBOROUGH) 3 LINESTRING (-79.26332 43.73...
## 4 53423 LINE 4 (SHEPPARD) 4 LINESTRING (-79.41113 43.76...
head(crime_sites)
## X Y OBJECTID EVENT_UNIQUE_ID REPORT_DATE OCC_DATE
## 1 -8855488 5416241 392761 GO-20241185211 2024-06-01 2024/06/01 05:00:00+00
## 2 -8840321 5412794 392762 GO-20241184252 2024-06-01 2024/06/01 05:00:00+00
## 3 -8829780 5425476 392763 GO-20241184623 2024-06-01 2024/06/01 05:00:00+00
## 4 -8851608 5406477 392764 GO-20241184531 2024-06-01 2024/06/01 05:00:00+00
## 5 -8851608 5406477 392765 GO-20241184531 2024-06-01 2024/06/01 05:00:00+00
## 6 -8819720 5439058 392766 GO-20241191208 2024-06-01 2024/06/01 05:00:00+00
## REPORT_YEAR REPORT_MONTH REPORT_DAY REPORT_DOY REPORT_DOW REPORT_HOUR
## 1 2024 June 1 153 Saturday 7
## 2 2024 June 1 153 Saturday 2
## 3 2024 June 1 153 Saturday 3
## 4 2024 June 1 153 Saturday 3
## 5 2024 June 1 153 Saturday 3
## 6 2024 June 1 153 Saturday 23
## OCC_YEAR OCC_MONTH OCC_DAY OCC_DOY OCC_DOW OCC_HOUR DIVISION
## 1 2024 June 1 153 Saturday 1 D23
## 2 2024 June 1 153 Saturday 2 D14
## 3 2024 June 1 153 Saturday 3 D33
## 4 2024 June 1 153 Saturday 3 D22
## 5 2024 June 1 153 Saturday 3 D22
## 6 2024 June 1 153 Saturday 23 D42
## LOCATION_TYPE
## 1 Single Home, House (Attach Garage, Cottage, Mobile)
## 2 Schools During Un-Supervised Activity
## 3 Other Commercial / Corporate Places (For Profit, Warehouse, Corp. Bldg
## 4 Dealership (Car, Motorcycle, Marine, Trailer, Etc.)
## 5 Dealership (Car, Motorcycle, Marine, Trailer, Etc.)
## 6 Single Home, House (Attach Garage, Cottage, Mobile)
## PREMISES_TYPE UCR_CODE UCR_EXT OFFENCE MCI_CATEGORY
## 1 House 2135 210 Theft Of Motor Vehicle Auto Theft
## 2 Educational 1610 200 Robbery - Mugging Robbery
## 3 Commercial 2120 200 B&E Break and Enter
## 4 Commercial 2120 200 B&E Break and Enter
## 5 Commercial 2135 210 Theft Of Motor Vehicle Auto Theft
## 6 House 2120 200 B&E Break and Enter
## HOOD_158 NEIGHBOURHOOD_158 HOOD_140
## 1 007 Willowridge-Martingrove-Richview (7) 007
## 2 080 Palmerston-Little Italy (80) 080
## 3 043 Victoria Village (43) 043
## 4 160 Mimico-Queensway (160) 017
## 5 160 Mimico-Queensway (160) 017
## 6 144 Morningside Heights (144) 131
## NEIGHBOURHOOD_140 LONG_WGS84 LAT_WGS84
## 1 Willowridge-Martingrove-Richview (7) -79.55021 43.68121
## 2 Palmerston-Little Italy (80) -79.41395 43.65881
## 3 Victoria Village (43) -79.31926 43.74118
## 4 Mimico (includes Humber Bay Shores) (17) -79.51535 43.61775
## 5 Mimico (includes Humber Bay Shores) (17) -79.51535 43.61775
## 6 Rouge (131) -79.22889 43.82926
#transform csv into shapefile
crime_sites = st_as_sf(crime_sites,
coords=c('LONG_WGS84','LAT_WGS84'),
crs=4326, # projection
remove=TRUE) # remove coordinate columns
crime_sites_wgs84 <- st_transform(crime_sites, crs = 4326)
leaflet() %>%
# Add CartoDB.Positron layer
addProviderTiles('CartoDB.Positron') %>%
addCircles(data = crime_sites,
color = 'firebrick',
fillOpacity = 1,
opacity = 1,
radius = 25,
label = ~MCI_CATEGORY, # Assuming you have a column for crime descriptions/labels
popup = ~UCR_CODE, # Popup for more details
group = "Crime Sites") %>%
# Add subway lines
addPolylines(data = subway_lines,
color = ~case_when( # Conditional color based on route name
ROUTE_NAME == "LINE 1 (YONGE-UNIVERSITY)" ~ "pink",
ROUTE_NAME == "LINE 2 (BLOOR - DANFORTH)" ~ "green",
ROUTE_NAME == "LINE 3 (SCARBOROUGH)" ~ "blue",
ROUTE_NAME == "LINE 4 (SHEPPARD)" ~ "yellow",
TRUE ~ "black"), # Default color
label = ~ROUTE_NAME, # Label the subway routes
popup = ~ROUTE_NAME,
group = "Subway Lines") %>%
# Add neighborhood polygons
addPolygons(data = neighborhoods,
fillColor = "gray", # Add a default color for neighborhoods
fillOpacity = 0.5,
color = "black",
weight = 1,
label = ~Neighbourh,
popup = ~Neighbourh,
group = "Neighborhoods") %>%
addLayersControl(
overlayGroups = c("Crime Sites", "Subway Lines", "Neighborhoods"),
options = layersControlOptions(collapsed = FALSE)
)
Part A. 2.
st_crs(crime_sites)
## Coordinate Reference System:
## User input: EPSG:4326
## wkt:
## GEOGCRS["WGS 84",
## ENSEMBLE["World Geodetic System 1984 ensemble",
## MEMBER["World Geodetic System 1984 (Transit)"],
## MEMBER["World Geodetic System 1984 (G730)"],
## MEMBER["World Geodetic System 1984 (G873)"],
## MEMBER["World Geodetic System 1984 (G1150)"],
## MEMBER["World Geodetic System 1984 (G1674)"],
## MEMBER["World Geodetic System 1984 (G1762)"],
## MEMBER["World Geodetic System 1984 (G2139)"],
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]],
## ENSEMBLEACCURACY[2.0]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["geodetic latitude (Lat)",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["geodetic longitude (Lon)",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## USAGE[
## SCOPE["Horizontal component of 3D system."],
## AREA["World."],
## BBOX[-90,-180,90,180]],
## ID["EPSG",4326]]
st_crs(subway_lines)
## Coordinate Reference System:
## User input: WGS 84
## wkt:
## GEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["latitude",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["longitude",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]]
st_crs(neighborhoods)
## Coordinate Reference System:
## User input: WGS 84
## wkt:
## GEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["latitude",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["longitude",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]]
Write-Up: (1) They all have datum as WGS 84. WGS 84 uses an ellipsoid (a mathematically defined, flattened sphere) to approximate the shape of the Earth. The specific ellipsoid used is called the WGS 84 ellipsoid. It uses an origin at the center of the Earth for defining locations. WGS 84 uses a geographic coordinate system based on latitude, longitude, and altitude. These coordinates are expressed in degrees for latitude and longitude, and meters for altitude. (2) The datum defines an origin point of the coordinate axes and the direction of the axes. (3) Yes, there is a need to apply a single datum type to all 3 datasets, so that the the type of ellipsoid model, Reference Frame and coordinate System are the same.
Part A.3 Projection affects spatial operations such as calculating distance and area. Operations between spatial objects also require them to be in the same projection. And UTM projection minimizes distortion over smaller areas.
crime_sites <- st_transform(crime_sites, crs = 32617)
subway_lines <- st_transform(subway_lines, crs = 32617)
neighborhoods <- st_transform(neighborhoods, crs = 32617)
Part A.4
utm17.area = neighborhoods %>%
st_area(geometry) %>% set_units(km^2)
tibble(
neighborhood = neighborhoods$Neighbourh,
area_km2 = as.numeric(utm17.area),
population = neighborhoods$Total_Popu
)
## # A tibble: 140 × 3
## neighborhood area_km2 population
## <chr> <dbl> <int>
## 1 Yonge-St.Clair 1.16 11655
## 2 York University Heights 13.2 27715
## 3 Lansing-Westgate 5.35 14640
## 4 Yorkdale-Glen Park 6.04 14685
## 5 Stonegate-Queensway 7.95 24690
## 6 Tam O'Shanter-Sullivan 5.42 27390
## 7 The Beaches 3.60 21135
## 8 Thistletown-Beaumond Heights 3.34 10140
## 9 Thorncliffe Park 3.13 19225
## 10 Danforth-East York 2.19 16710
## # ℹ 130 more rows
plot(data.frame(utm17.area, neighborhoods$Total_Popu))
cor(as.numeric(utm17.area), as.numeric(neighborhoods$Total_Popu), use = "complete.obs")
## [1] 0.6205686
Write-Up: From the scatter plot, we can see there is a positive correlation (0.6205686) between the area and population. This means population will increase as area increases.
Part A.5
#spatial join of crimes to neighborhoods
neighborhood_crimes <- st_join(neighborhoods, crime_sites, join = st_intersects)
neighborhood_crimes <- st_transform(neighborhood_crimes, crs = 4326)
# Summarize the number of crimes per Neighbourh
crime.neighor = neighborhood_crimes %>% as_tibble() %>%
group_by(Neighbourh) %>%
summarise(total_crimes=n())
population = neighborhoods$Total_Popu # Sum of population columns (adjust names as needed)
crime_rate_per_capita = crime.neighor$total_crimes / population
crime_summary <- tibble(crime.neighor,population=population,crime_rate_per_capita=crime_rate_per_capita )
Merged_Neighbor_crimes = merge(neighborhoods_wgs84, crime.neighor, by='Neighbourh')
Merged_Neighbor_crimes = merge(Merged_Neighbor_crimes, crime_summary, by='Neighbourh')
#5(a):
crime.pal = colorNumeric(palette = "YlOrRd",
domain=Merged_Neighbor_crimes$total_crimes.x)
leaflet(Merged_Neighbor_crimes) %>%
addProviderTiles('CartoDB.Positron') %>%
addPolygons(weight=0, fillOpacity=.8, color=~crime.pal(Merged_Neighbor_crimes$total_crimes.x),
label=~paste0(Neighbourh, ": ", total_crimes.x)) %>%
addLegend(pal = crime.pal,
values = Merged_Neighbor_crimes$total_crimes.x,
title = "Total Crimes",
position = "bottomright")
#5(b):
crime_rate.pal = colorNumeric(palette = "YlOrRd",
domain=Merged_Neighbor_crimes$crime_rate_per_capita)
leaflet(Merged_Neighbor_crimes) %>%
addProviderTiles('CartoDB.Positron') %>%
addPolygons(weight=0, fillOpacity=.8, color=~crime_rate.pal(Merged_Neighbor_crimes$crime_rate_per_capita),
label=~paste0(Merged_Neighbor_crimes$Neighbourh, ": ", Merged_Neighbor_crimes$crime_rate_per_capita)) %>%
addLegend(pal = crime_rate.pal,
values = Merged_Neighbor_crimes$crime_rate_per_capita,
title = "Total Crimes",
position = "bottomright")
From the map showing the counts of crimes in each neighborhood, we can see that Waterfront Communities-The Island have the most number of crimes. And the neighborhood around these this neighborhoods have more crimes compared to others. Also, we can see there are more crimes on the west than the east.
From the map showing the per capita crime rate, we can see that West Humber-clairville have the highest rates. And the neighborhood around these this neighborhoods have higher crimes rates compared to others. Similarly, we can see there are higher crime rate on the west than the east.
Part A.6
subway_buffer <- st_buffer(subway_lines, dist = 1000)
subway_buffer_wgs84 <- st_transform(subway_buffer, crs = 4326)
#Create the leaflet map with the CartoDB tiles
leaflet() %>%
addProviderTiles('CartoDB.Positron') %>%
addPolylines(data = subway_lines_wgs84, color = "blue", weight = 2, opacity = 0.8) %>%
addPolygons(data = subway_buffer_wgs84, fillColor = "red", fillOpacity = 0.3, color = "red", weight = 1) %>%
addCircles(data = crime_sites_wgs84, color = 'black', fillOpacity = 0.1, radius = 25) %>%
addLegend("bottomright", colors = "red", labels = "1 km Buffer", title = "Legend")
crimes_in_buffer <- st_join(subway_buffer_wgs84, crime_sites_wgs84, join = st_intersects)
crime_summary_table <- crimes_in_buffer %>%
as_tibble() %>%
group_by(ROUTE_NAME) %>%
summarise(total_crimes_in_buffer = n()) %>%
mutate(proportion_of_crimes_occured_within_buffer = total_crimes_in_buffer / nrow(crime_sites))
crime_summary_table
## # A tibble: 4 × 3
## ROUTE_NAME total_crimes_in_buffer proportion_of_crimes_occure…¹
## <chr> <int> <dbl>
## 1 LINE 1 (YONGE-UNIVERSITY) 1006 0.256
## 2 LINE 2 (BLOOR - DANFORTH) 683 0.174
## 3 LINE 3 (SCARBOROUGH) 111 0.0283
## 4 LINE 4 (SHEPPARD) 79 0.0201
## # ℹ abbreviated name: ¹​proportion_of_crimes_occured_within_buffer
We can see the proportion of the crimes occurred within 1 km subway line 1 is nearly 25.6%, and the proportion of the crimes occurred within 1 km subway line 1 is nearly 17.4%.
Part A.7
#(a)The following neighborhoods has subway
neighborhoods_subway_or_not = st_join(neighborhoods, subway_lines, join = st_intersects)
neighborhoods_subway_or_not <- neighborhoods_subway_or_not %>% distinct(Neighbourh, .keep_all = TRUE)
neighborhoods_subway= filter(neighborhoods_subway_or_not, ROUTE_NAME!='NA')
neighborhoods_without_subway= filter(neighborhoods_subway_or_not, is.na(ROUTE_NAME))
print(neighborhoods_subway$Neighbourh)
## [1] "Yonge-St.Clair" "York University Heights"
## [3] "Lansing-Westgate" "Yorkdale-Glen Park"
## [5] "Danforth-East York" "Humewood-Cedarvale"
## [7] "Islington-City Centre West" "Danforth"
## [9] "St.Andrew-Windfields" "Taylor-Massey"
## [11] "Church-Yonge Corridor" "Clairlea-Birchmount"
## [13] "Ionview" "North Riverdale"
## [15] "Forest Hill North" "Waterfront Communities-The Island"
## [17] "Kennedy Park" "Clanton Park"
## [19] "Casa Loma" "Kensington-Chinatown"
## [21] "Kingsway South" "Runnymede-Bloor West Village"
## [23] "Forest Hill South" "Henry Farm"
## [25] "Annex" "University"
## [27] "Dorset Park" "Dovercourt-Wallace Emerson-Juncti"
## [29] "Newtonbrook West" "High Park North"
## [31] "High Park-Swansea" "North St.James Town"
## [33] "Oakridge" "Rosedale-Moore Park"
## [35] "Bathurst Manor" "Bendale"
## [37] "Downsview-Roding-CFB" "Lambton Baby Point"
## [39] "Bay Street Corridor" "Willowdale East"
## [41] "Willowdale West" "Cabbagetown-South St.James Town"
## [43] "East End-Danforth" "Playter Estates-Danforth"
## [45] "Woburn" "Woodbine-Lumsden"
## [47] "Bayview Village" "Bedford Park-Nortown"
## [49] "Bridle Path-Sunnybrook-York Mills" "Lawrence Park South"
## [51] "Lawrence Park North" "Yonge-Eglinton"
#(b)
neighborhoods_subway_crime <- st_join(neighborhoods_subway, crime_sites, join = st_intersects)
neighborhoods_wihtout_subway_crime <- st_join(neighborhoods_without_subway, crime_sites, join = st_intersects)
neighborhoods_subway_crime_table<-neighborhoods_subway_crime %>% as_tibble() %>%
group_by(Neighbourh) %>%
summarise(total_crimes=n())
neighborhoods_without_subway_crime_table<-neighborhoods_wihtout_subway_crime %>% as_tibble() %>%
group_by(Neighbourh) %>%
summarise(total_crimes=n())
print(mean(neighborhoods_subway_crime_table$total_crimes))
## [1] 34.40385
print(mean(neighborhoods_without_subway_crime_table$total_crimes))
## [1] 24.10227
We can see the average of crimes in the neighborhoods with subway is higher than the average of crimes in the neighborhoods without subway.
Part B.1
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)
library(sf)
library(geoR)
## Warning: package 'geoR' was built under R version 4.3.2
## --------------------------------------------------------------
## Analysis of Geostatistical Data
## For an Introduction to geoR go to http://www.leg.ufpr.br/geoR
## geoR version 1.9-4 (built on 2024-02-14) is now loaded
## --------------------------------------------------------------
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Loading required package: spam
## Spam version 2.10-0 (2023-10-23) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
##
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
##
## Loading required package: viridisLite
##
## Try help(fields) to get started.
##
## Attaching package: 'fields'
##
## The following object is masked from 'package:leaflet':
##
## addLegend
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(mapdata)
library(units)
Part B (1)(a)
# Set parameter values
h <- seq(1e-5, 15, length.out = 10000)
tau2 <- 0.5
sigma2 <- 4
phi <- 6
lambda <- 0.5
# Define the semivariogram functions
linear <- tau2 + sigma2 * h
power_model <- tau2 + sigma2 * (h**lambda)
rational_quadratic <- tau2 + sigma2 * (h**2 / (1 + phi * (h**2)))
powered_exponential <- tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda))
# Handle the Wave semivariogram
wave <- tau2 + sigma2 * (1 - sin(phi * h) / (phi * h))
# Plot the semivariograms
plot(h, linear, type = "l", col = "red", ylim = c(0, 10), ylab = "Semivariance", xlab = "Distance h")
lines(h, power_model, col = "blue")
lines(h, rational_quadratic, col = "green")
lines(h, powered_exponential, col = "purple")
lines(h, wave, col = "orange")
abline(h = sigma2+tau2, col = "black")
legend("topright", legend = c("Linear", "Power model", "Rational quadratic", "Powered exponential", "Wave"), col = c("red", "blue", "green", "purple", "orange"), lty = 1, cex = 0.7)
Part B (1)(b)
1.Linear: The semivariance increases linearly, with the increase of distance. The value of semivariance exceed the sill as h(distance) increase to a certain points. 2.Power Model: The semivariance increases with a decreasing slope, with the increase of distance. Similarly, the value of semivariance exceed the sill as h(distance) increase to a certain points. 3.Rational Quadratic: Like power model, the semivariance increases with a decreasing slope. However, the maximum value of semivariance in rational Quadratic model is far smaller than the sill. Semivariance in rational Quadratic model can never reach the sill with the increase of h. 4.Powered Exponential: The semivariance increases with a decreasing slope, with the increase of distance. The value of semivariance approach closely to the sill, when h approaches closely to the range(phi). 5.Wave: At first, semivariance increases as h increase. Then after a certain point, the The value of semivariance oscillates around the sill. Then, as h increases further, the amplitude becomes smaller.
The Powered Exponential and wave model are stationary: As the semivariance approaches to the sill as h increases. This indicates that the spatial correlation between points becomes constant beyond a certain distance. Also it means covariance(C(h)) goes to 0 as h goes to infinity. So C(h) exists. So stationarity is satified.
The line model and power model are not stationary. The value of semivariance increase indefinitely as h(distance) increase.
The Rational Quadratic model is stationary. Because the semivariance becomes more and more flatter as h increases. However, the asymptote that semivariance can reach is smaller the real sill, meaning the variance beyond the range is smaller.
Part B (1)(c)
lambda_values <- c(0.5, 1, 2)
plot(h, tau2 + sigma2 * h^lambda_values[1], type = "l", col = "blue", ylim = c(0, 20),
ylab = "Semivariance", xlab = "Distance h", main = "Power Models")
lines(h,tau2 + sigma2 * h^lambda_values[2], col = "purple", lwd = 1)
lines(h,tau2 + sigma2 * h^lambda_values[3], col = "darkgreen", lwd = 1)
legend("topright", legend = c("lambda = 0.5", "lambda = 1", "lambda = 2"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
The parameter lambda_values represents the slope of the curves. In other
words, λ controls how quickly semivariance increases as the distance
increases. For example, (1)when lamda=1, the power model is the same as
linear function. (2)When lamda > 1, the semivariance increases with
increasing slope as h increase. The increase becomes steeper. The first
derivative keeps increasing. (3)When lamda < 1, the semivariance
increases with decreasing slop as h increase. The increase is more
gradual. The first derivative keeps decreasing
#(c) continued
lambda_values <- c(0.5, 1, 2)
plot(h, tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda_values[1])), type = "l", col = "blue", ylim = c(0, 5),
ylab = "Semivariance", xlab = "Distance h", main = "Powered Exponential Models")
lines(h,tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda_values[2])), col = "purple", lwd = 1)
lines(h,tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda_values[3])), col = "darkgreen", lwd = 1)
legend("topright", legend = c("lambda = 0.5", "lambda = 1", "lambda = 2"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
The parameter lambda represents: how fast the semivariance move close to
the sill. We can see: The larger the value of lambda is, the faster the
semivariance approach to the sill. But no matter what the value of lamda
is, the semivariance increases with increasing slope as h increase. The
increase becomes steeper.
Part B (1)(d) ######### rational_quadratic #########
tau2_values <- c(0.5, 1, 2)
sigma2_values <- c(4,5,6)
phi_values <- c(6, 10, 14)
rational_quadratic <- tau2 + sigma2 * (h^2 / (1 + phi * h^2))
#rational_quadratic with changing tau
plot(h, tau2_values[1] + sigma2 * (h^2 / (1 + phi * h^2)), type = "l", col = "blue", ylim = c(0, 5),
ylab = "Semivariance", xlab = "Distance h", main = "Rational quadratic with changing tau")
lines(h,tau2_values[2] + sigma2 * (h^2 / (1 + phi * h^2)), col = "purple", lwd = 1)
lines(h,tau2_values[3] + sigma2 * (h^2 / (1 + phi * h^2)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("tau2 = 0.5", "tau2 = 1", "tau2 = 2"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
tau(nugget) represent the discontinuity at the origin. The parameter tau
can move the curves vertically. For the same h, the value of
semivariance is larger when tau2 is larger. And, the value of tau does
not affect the slope of changing of semivariance.
#rational_quadratic with changing sigma2
plot(h, tau2 + sigma2_values[1] * (h^2 / (1 + phi * h^2)), type = "l", col = "blue", ylim = c(0, 10),
ylab = "Semivariance", xlab = "Distance h", main = "Rational quadratic with changing sigma2")
lines(h,tau2 + sigma2_values[2] * (h^2 / (1 + phi * h^2)), col = "purple", lwd = 1)
lines(h,tau2 + sigma2_values[3] * (h^2 / (1 + phi * h^2)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("sigma2 = 4", "sigma2 = 5", "sigma2 = 6"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
sigma2(sill) represent the asymptote where semivariance is reached. A
larger sigma2 means semivariance can reach a higher value. A smaller
sigma2 means semivariance can reach a smaller value. However,the
distance where the semivariance stabilizes does not change with
sigma2.
phi_values <- c(1, 6,20)
#rational_quadratic with changing phi
plot(h, tau2 + sigma2 * (h^2 / (1 + phi_values[1] * h^2)), type = "l", col = "blue", ylim = c(0, 10),
ylab = "Semivariance", xlab = "Distance h", main = "Rational quadratic with changing phi")
lines(h,tau2 + sigma2 * (h^2 / (1 + phi_values[2] * h^2)), col = "purple", lwd = 1)
lines(h,tau2 + sigma2 * (h^2 / (1 + phi_values[3] * h^2)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("phi = 1", "phi = 6", "phi = 20"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
phi(range) represents the distance where the semivariance reached the
asymptote. Beyond this distance, it is assumed that there is no
autocorrelation any more. phi Controls how quickly the semivariogram
reaches its sill. A smaller phi means the semivariogram reaches its sill
faster. Interestingly, the value of asymptote(the highest possible value
of semivariance) does change with phi in the case of rational
quadratic.
wave
tau2_values <- c(0.5, 1, 2)
sigma2_values <- c(4,5,6)
phi_values <- c(6, 10, 14)
wave <- tau2 + sigma2 * (1 - sin(phi * h) / (phi * h))
#wave with changing tau
plot(h, tau2_values[1] + sigma2 * (1 - sin(phi * h) / (phi * h)), type = "l", col = "blue", ylim = c(0, 8),
ylab = "Semivariance", xlab = "Distance h", main = "wave function with changing tau")
lines(h, tau2_values[2] + sigma2 * (1 - sin(phi * h) / (phi * h)), col = "purple", lwd = 1)
lines(h, tau2_values[3] + sigma2 * (1 - sin(phi * h) / (phi * h)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("tau2 = 0.5", "tau2 = 1", "tau2 = 2"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
Similary, tau(nugget) represent the discontinuity at the origin. The
parameter tau can move the curves vertically. For the same h, the value
of semivariance is larger when tau2 is larger. And, the value of tau
does not affect the slope of changing of semivariance. And the shape of
wave does not change
#wave with changing sigma2
plot(h, tau2 + sigma2_values[1] * (1 - sin(phi * h) / (phi * h)), type = "l", col = "blue", ylim = c(0, 8),
ylab = "Semivariance", xlab = "Distance h", main = "wave function with changing sigma2")
lines(h, tau2 + sigma2_values[2] * (1 - sin(phi * h) / (phi * h)), col = "purple", lwd = 1)
lines(h, tau2 + sigma2_values[3] * (1 - sin(phi * h) / (phi * h)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("sigma2 = 4", "sigma2 = 5", "sigma2 = 6"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
Similarly, sigma2(sill) represent the asymptote where semivariance is
reached. A larger sigma2 means semivariance can reach a higher value. A
smaller sigma2 means semivariance can reach a smaller value. However,the
distance where the semivariance stabilizes does not change with sigma2.
And the shape of wave does not change
#wave with changing phi
plot(h, tau2 + sigma2 * (1 - sin(phi_values[1] * h) / (phi_values[1]* h)), type = "l", col = "blue", ylim = c(0, 8),
ylab = "Semivariance", xlab = "Distance h", main = "wave function with changing phi")
lines(h, tau2 + sigma2 * (1 - sin(phi_values[2] * h) / (phi_values[2] * h)), col = "purple", lwd = 1)
lines(h, tau2 + sigma2 * (1 - sin(phi_values[3] * h) / (phi_values[3] * h)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("phi = 6", "phi = 10", "phi = 14"),
col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)
Similarly, phi(range) represents the distance where the semivariance
reached the asymptote. phi Controls how quickly the semivariogram
reaches its sill. We can see that the semivariogram of the wave function
with a larger phi reaches its sill faster. Note that the value of
asymptote does not change with phi.
Part B (2)
#I use the same parameter values as question(1)
exponential <- tau2+sigma2*(1-exp(-h/phi))
spherical <- tau2 +sigma2*(1.5*(h/phi)-0.5*((h/phi)**3))
gaussain <- tau2 +sigma2*(1-exp(-(h**2)/(phi*phi)))
#c(h)= c(0)-semivarigrarm
#c(0)= data variance = sill
cov_exponential = tau2+sigma2 - exponential
cov_spherical = tau2+sigma2- spherical
cov_gaussain =tau2+sigma2 - gaussain
plot(h,cov_exponential,type = "l",ylab = "C(h)", ylim=c(0,10), xlab = "Distance h", main = "C(h) of exponential function")
plot(h,cov_spherical, type = "l",ylab = "C(h)", ylim=c(0,10),xlab = "Distance h", main = "C(h) of spherical function",)
plot(h,cov_gaussain, type = "l", ylab = "C(h)", ylim=c(0,10),xlab = "Distance h", main = "C(h) of gaussain function")
Part B (3)
# For the purpose of neatness, we use a function
matern_covariance <- function(h, kappa, phi, sigma2 = 1) {
term1 <- (sigma2 / (gamma(kappa) * (2**(kappa - 1))))
term2 <- (sqrt(2 * kappa) * (h / phi))**kappa
term3 <- besselK((sqrt(2 * kappa) * (h / phi)), kappa)
return(term1 * term2 * term3)
}
# Define parameter values
sigma2 <- 4
phi <- 6
h <- seq(0, 10, length.out = 100)
k_values <- c(0.01, 0.1, 1, 5, 10)
# Create an empty plot
plot(h, matern_covariance(h, k_values[1], sigma2, phi), type = "l", col = "blue", ylim = c(0, 6),
ylab = "C(h)", xlab = "Distance h", main = "Matern Covariance with Changing k")
lines(h, matern_covariance(h, k_values[2], sigma2, phi), col = "red")
lines(h, matern_covariance(h, k_values[3], sigma2, phi), col = "green")
lines(h, matern_covariance(h, k_values[4], sigma2, phi), col = "orange")
lines(h, matern_covariance(h, k_values[5], sigma2, phi), col = "black")
lines(h, matern_covariance(h, 10000, sigma2, phi), col = "purple")
legend("topright", legend = paste("kappa =", k_values), col = c("blue", "red", "green","orange","brown","purple"), lty = 1)
As k decrease to 0, the lines becomes a flat line. This means C(h)
decrease with a decreasing speed. The shape of line become more similar
to C(h) of exponential function When k<1, the C(0) is not equal to
sigma2. When k>1, the C(0) is equal to sigma2. As k increase, the
lines becomes steeper, which means the C(h) decrease with a increasing
speed.The shape of line become more similar to C(h) of gaussain function
However, as k increase to infinity, the line vanish, maybe due to
numeric instability in R.
Part B (4)
dorian <- read.csv(paste0(data.dir, "/dorian.csv"))
head(dorian)
## lat lon temp dew.point ceiling.ht wind.dir wind.sp atm.press
## 1 32.483 -80.717 27.76667 21.13333 22000 315.0000 2.850000 1011.000
## 2 32.550 -80.450 25.48333 21.20000 NA 298.3333 1.083333 1010.500
## 3 32.782 -79.925 25.98333 NA NA 295.0000 3.350000 1010.067
## 4 32.899 -80.041 25.65000 19.90000 22000 321.6667 4.566667 1010.150
## 5 32.900 -80.033 25.30000 19.70000 22000 320.0000 4.650000 1010.250
## 6 33.350 -79.183 24.88333 21.51667 NA 296.6667 4.700000 1007.833
## rh
## 1 68.24173
## 2 77.99672
## 3 NA
## 4 71.31048
## 5 71.89816
## 6 81.84029
dorian = dorian %>%
st_as_sf(coords=c('lon','lat'), crs=4326, remove=FALSE) %>% #set datum
st_transform(crs=32617) %>% #project to UTM 11 (meters)
mutate(x = st_coordinates(.)[,1]/1000, y = st_coordinates(.)[,2]/1000) # extract x, y in km
head(dorian)
## Simple feature collection with 6 features and 11 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 526589.8 ymin: 3594010 xmax: 669074.5 ymax: 3691563
## Projected CRS: WGS 84 / UTM zone 17N
## lat lon temp dew.point ceiling.ht wind.dir wind.sp atm.press
## 1 32.483 -80.717 27.76667 21.13333 22000 315.0000 2.850000 1011.000
## 2 32.550 -80.450 25.48333 21.20000 NA 298.3333 1.083333 1010.500
## 3 32.782 -79.925 25.98333 NA NA 295.0000 3.350000 1010.067
## 4 32.899 -80.041 25.65000 19.90000 22000 321.6667 4.566667 1010.150
## 5 32.900 -80.033 25.30000 19.70000 22000 320.0000 4.650000 1010.250
## 6 33.350 -79.183 24.88333 21.51667 NA 296.6667 4.700000 1007.833
## rh geometry x y
## 1 68.24173 POINT (526589.8 3594010) 526.5898 3594.010
## 2 77.99672 POINT (551638.3 3601535) 551.6383 3601.535
## 3 NA POINT (600670.7 3627631) 600.6707 3627.631
## 4 71.31048 POINT (589689.4 3640498) 589.6894 3640.498
## 5 71.89816 POINT (590436.7 3640616) 590.4367 3640.616
## 6 81.84029 POINT (669074.5 3691563) 669.0745 3691.563
dorian1 <- dorian %>% st_drop_geometry()
head(dorian1)
## lat lon temp dew.point ceiling.ht wind.dir wind.sp atm.press
## 1 32.483 -80.717 27.76667 21.13333 22000 315.0000 2.850000 1011.000
## 2 32.550 -80.450 25.48333 21.20000 NA 298.3333 1.083333 1010.500
## 3 32.782 -79.925 25.98333 NA NA 295.0000 3.350000 1010.067
## 4 32.899 -80.041 25.65000 19.90000 22000 321.6667 4.566667 1010.150
## 5 32.900 -80.033 25.30000 19.70000 22000 320.0000 4.650000 1010.250
## 6 33.350 -79.183 24.88333 21.51667 NA 296.6667 4.700000 1007.833
## rh x y
## 1 68.24173 526.5898 3594.010
## 2 77.99672 551.6383 3601.535
## 3 NA 600.6707 3627.631
## 4 71.31048 589.6894 3640.498
## 5 71.89816 590.4367 3640.616
## 6 81.84029 669.0745 3691.563
dorian1_wind<-as.geodata(dorian1,coords.col=c(10,11),data.col=7)
dorian1_atm<-as.geodata(dorian1,coords.col=c(10,11),data.col=8)
semi_var_wind<-variog(dorian1_wind,option="cloud")
## variog: computing omnidirectional variogram
semi_var_atm<-variog(dorian1_atm,option="cloud")
## variog: computing omnidirectional variogram
plot(semi_var_wind,xlab="Distance (h), km", main= 'semivariance of all pairs of point in terms of wind speed')
plot(semi_var_atm,xlab="Distance (h), km",main= 'semivariance of all pairs of point in terms of wind speed')
#empirical semi-variogram of wind speed
semi_var_wind_bin<-variog(dorian1_wind,uvec=seq(0,semi_var_wind$max.dist,l=20),option="bin")
## variog: computing omnidirectional variogram
plot(semi_var_wind_bin, xlab="Distance (h), km",main = "empirical semi-variogram bin plot of wind speed")
semi_var_wind_box<-variog(dorian1_wind,uvec=seq(0,semi_var_wind$max.dist,l=20),bin.cloud=T)
## variog: computing omnidirectional variogram
plot(semi_var_wind_box,bin.cloud=T,xlab="Bin", main = "empirical semi-variogram boxplot of wind speed")
#empirical semi-variogram of wind speed
semi_var_atm_bin<-variog(dorian1_atm,uvec=seq(0,semi_var_atm$max.dist,l=20),option="bin")
## variog: computing omnidirectional variogram
plot(semi_var_atm_bin, xlab="Distance (h), km", main = "empirical semi-variogram bin plot of atmosphere pressure")
semi_var_atm_box<-variog(dorian1_atm,uvec=seq(0,semi_var_atm$max.dist,l=20),bin.cloud=T)
## variog: computing omnidirectional variogram
plot(semi_var_atm_box,bin.cloud=T,xlab="Bin", main = "empirical semi-variogram boxplot of atmosphere pressure")